## Predicting Academic Performance Index (API) Score
# from the California Department of Education
# through Linear Regression


library(car) #cross validation
## Loading required package: carData
library(ggplot2) #ggplot
library(lattice)
library(MASS)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret) #linear
library(leaps) #regsubsets
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(e1071)
library(sandwich) 
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
## 
##     expand
library(purrr) #keep
## 
## Attaching package: 'purrr'
## The following objects are masked from 'package:foreach':
## 
##     accumulate, when
## The following object is masked from 'package:caret':
## 
##     lift
## The following object is masked from 'package:car':
## 
##     some
library(ModelMetrics) #mse
## 
## Attaching package: 'ModelMetrics'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
## 
##     kappa
library(corrplot)
## corrplot 0.84 loaded
set.seed(1234)

# Load the 2005 API Base data set (one row per school/LEA record).
Base05 <- read.csv("/Users/DavidKwon/Desktop/Yongbock/API Score Prj/API 2005 Base Data.csv")


#"The 2005 API (Academic Performance Index) summarizes a school's, an LEA's (local educational agency, is a school district or county office of education), or the state's performance on the spring 2005 Standardized Testing and Reporting (STAR) Program and 2005 California High School Exit Examination (CAHSEE)."
#"The 2005 API Base summarizes a subgroup's (e.g STAR Program scores, Ethnic/racial subgroup/ parents' degrees/ English Learners ...) performance on the spring 2005 STAR Program and 2005 CAHSEE. It serves as the baseline score, or starting point, of performance of that subgroup"

# Remove variables that are derived from the dependent variable (SIM_RANK,
# ST_RANK), and RTYPE, which overlaps with STYPE.
Base05 <- select(Base05, -c("RTYPE", "SIM_RANK", "ST_RANK"))

# Drop rows where the dependent variable API05B is missing.
# NOTE: logical subsetting is used instead of `Base05[-which(is.na(...)), ]`;
# the negative-index form silently returns ZERO rows when which() is empty
# (negative indexing with integer(0) selects nothing).
Base05 <- Base05[!is.na(Base05$API05B), ]


# Proportion of missing (NA) values in a vector.
#
# x: any vector (typically a data-frame column).
# Returns a scalar in [0, 1]; NaN for a zero-length input (same as the
# original length(which(...))/length(x) form, which this replaces idiomatically).
findingNA <- function(x) {
  mean(is.na(x))
}


# Print the position of every column whose NA share exceeds 10%.
for (col_idx in seq_len(ncol(Base05))) {
  if (findingNA(Base05[[col_idx]]) > 0.1) {
    print(col_idx)
  }
}
## [1] 3
## [1] 4
## [1] 5
## [1] 42
## [1] 46
## [1] 47
## [1] 48
## [1] 66
# Remove every variable in which more than 10% of the elements are NA,
# plus the "X" variable (school or district code, column 1).
# The high-NA columns are computed here with findingNA() rather than
# hard-coding the indices printed by the loop above.
high_na_cols <- which(vapply(Base05, findingNA, numeric(1)) > 0.1)
newbase <- Base05[, -c(1, high_na_cols)]

# Keep only fully observed rows.
new.base <- newbase[complete.cases(newbase), ]



# let's explore our dependent variable with ggplot

### histogram of API05B with density overlay; mean (red) and median (blue) lines
# NOTE: the column is mapped as aes(x = API05B) so ggplot evaluates it in the
# context of `data` — never aes(new.base$API05B), which bypasses `data`.
# The y axis is density (not raw counts), so it is labeled accordingly.
ggplot(data = new.base, aes(x = API05B)) +
  geom_histogram(aes(y = ..density..), binwidth = 10) +
  geom_density(aes(y = ..density..), color = "red") +
  labs(title = "Histogram for Academic Performance Index 2005 Base",
       x = "API 2005 Base", y = "Density") +
  geom_vline(xintercept = mean(new.base$API05B), show.legend = TRUE, color = "red") +
  geom_vline(xintercept = median(new.base$API05B), show.legend = TRUE, color = "blue")

# As the graph shows, API05B appears roughly normally distributed with a slight skew.
# Our independent factor variables: one bar chart per factor column
new.base[1:97] %>%
  keep(is.factor) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap(~key, scales = "free")
## Warning: attributes are not identical across measure variables;
## they will be dropped

# Our independent numerical variables
# Helper: density-overlaid histogram panel for every numeric column of `df`.
# (This same five-line plot was previously copy-pasted five times.)
plot_numeric_dist <- function(df) {
  df %>% keep(is.numeric) %>% gather() %>%
    ggplot(aes(value)) +
    facet_wrap(~key, scales = "free") +
    geom_histogram(bins = 30) +
    geom_density()
}

# Plot the numeric columns in batches of ~20 so the facets stay readable.
plot_numeric_dist(new.base[1:20])

plot_numeric_dist(new.base[21:40])

plot_numeric_dist(new.base[41:60])

plot_numeric_dist(new.base[61:80])

plot_numeric_dist(new.base[81:96])

# most of our independent numerical variables are highly skewed
# we might want to perform a log transformation, but I will do that later.
########## splitting training and test dataset - cross validation

# 70/30 split, stratified on the response
train_idx <- createDataPartition(y = new.base$API05B, p = 0.7, list = FALSE)

training.newbase <- new.base[train_idx, ]
test.newbase <- new.base[-train_idx, ]

# First model: regress API05B on all remaining predictors.
# NOTE: the name `lm` shadows stats::lm(); it is kept unchanged because
# later code refers to this object by that name.
lm <- lm(API05B ~ ., data = training.newbase)
summary(lm)
## 
## Call:
## lm(formula = API05B ~ ., data = training.newbase)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -369.99  -23.30    0.19   23.70  387.98 
## 
## Coefficients: (15 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.060e+02  7.808e+02   0.648 0.516919    
## STYPE2       1.372e+00  4.938e+00   0.278 0.781090    
## STYPE3       4.516e-01  9.240e+00   0.049 0.961025    
## STYPE4       2.763e-01  2.085e+01   0.013 0.989427    
## STYPEE       1.063e-01  4.738e+00   0.022 0.982096    
## STYPEH       4.048e+00  5.705e+00   0.709 0.478061    
## STYPEM       5.841e+00  5.283e+00   1.106 0.268877    
## VALID_NUM   -7.991e-01  3.561e-01  -2.244 0.024862 *  
## AA_NUM      -3.045e-02  2.121e-02  -1.436 0.151152    
## AA_SIGYes   -3.391e+00  2.288e+00  -1.482 0.138345    
## AI_NUM      -6.781e-02  4.610e-02  -1.471 0.141362    
## AI_SIGYes   -8.093e-01  8.016e+00  -0.101 0.919580    
## AS_NUM      -4.403e-02  2.110e-02  -2.086 0.037018 *  
## AS_SIGYes   -1.582e+00  2.395e+00  -0.661 0.508908    
## FI_NUM      -5.377e-02  2.249e-02  -2.390 0.016861 *  
## FI_SIGYes   -3.005e+00  3.826e+00  -0.785 0.432197    
## HI_NUM      -4.091e-02  2.060e-02  -1.986 0.047085 *  
## HI_SIGYes    2.237e+00  2.104e+00   1.063 0.287824    
## PI_NUM      -4.308e-02  5.160e-02  -0.835 0.403800    
## PI_SIGYes   -1.673e+01  9.627e+00  -1.738 0.082243 .  
## WH_NUM      -4.066e-02  2.077e-02  -1.958 0.050328 .  
## WH_SIGYes    5.455e+00  1.823e+00   2.992 0.002781 ** 
## SD_NUM      -4.120e-03  2.866e-03  -1.437 0.150669    
## SD_SIGYes    1.189e+00  2.205e+00   0.539 0.589895    
## EL_NUM       3.784e-03  3.014e-03   1.256 0.209326    
## EL_SIGYes   -4.284e+00  1.925e+00  -2.225 0.026102 *  
## DI_NUM       7.052e-03  1.066e-02   0.662 0.508224    
## DI_SIGYes    3.047e+00  1.991e+00   1.531 0.125913    
## PCT_AA       4.742e-01  1.864e-01   2.544 0.010969 *  
## PCT_AI       1.370e-01  2.284e-01   0.600 0.548628    
## PCT_AS       2.192e+00  1.919e-01  11.422  < 2e-16 ***
## PCT_FI       1.622e+00  2.558e-01   6.338 2.48e-10 ***
## PCT_HI       1.148e+00  1.771e-01   6.483 9.61e-11 ***
## PCT_PI       5.823e-01  5.605e-01   1.039 0.298873    
## PCT_WH       1.537e+00  1.770e-01   8.684  < 2e-16 ***
## MEALS       -4.013e-01  4.415e-02  -9.088  < 2e-16 ***
## P_FDAY       9.904e-02  5.646e-02   1.754 0.079454 .  
## P_GATE       1.141e+00  8.114e-02  14.062  < 2e-16 ***
## P_MIGED     -4.586e-01  7.743e-02  -5.923 3.32e-09 ***
## P_EL        -6.977e-01  6.889e-02 -10.127  < 2e-16 ***
## P_RFEP       8.360e-01  1.051e-01   7.956 2.08e-15 ***
## P_DI        -1.358e+00  7.909e-02 -17.171  < 2e-16 ***
## SMOB        -1.912e+00  4.606e-01  -4.151 3.35e-05 ***
## CBMOB        3.362e-01  4.649e-01   0.723 0.469513    
## DMOB         2.913e-01  1.026e-01   2.839 0.004536 ** 
## PCT_RESP     2.379e-01  3.037e-02   7.835 5.45e-15 ***
## NOT_HSG     -6.839e-01  1.498e-01  -4.565 5.08e-06 ***
## HSG         -3.870e-01  1.069e-01  -3.621 0.000295 ***
## SOME_COL    -4.914e-01  1.017e-01  -4.831 1.39e-06 ***
## COL_GRAD     1.297e-01  1.294e-01   1.002 0.316350    
## GRAD_SCH     6.633e-01  1.780e-01   3.726 0.000196 ***
## AVG_ED       1.448e+01  6.414e+00   2.258 0.023997 *  
## FULL         2.181e-01  8.914e-02   2.447 0.014438 *  
## EMER         2.709e-02  1.222e-01   0.222 0.824590    
## PEN_2       -2.330e+00  1.519e+00  -1.534 0.125145    
## PEN_35      -1.954e+00  1.520e+00  -1.286 0.198534    
## PEN_6       -2.668e+00  1.521e+00  -1.754 0.079396 .  
## PEN_78      -2.391e+00  1.520e+00  -1.573 0.115757    
## PEN_911     -2.663e+00  1.523e+00  -1.749 0.080374 .  
## ENROLL      -9.971e-02  3.069e-02  -3.249 0.001163 ** 
## PARENT_OPT   2.669e-01  9.873e-02   2.703 0.006887 ** 
## TESTED       1.058e-01  3.254e-02   3.253 0.001149 ** 
## VCST_E28    -3.215e-02  4.189e-01  -0.077 0.938814    
## PCST_E28            NA         NA      NA       NA    
## VCST_E911    4.933e-01  1.563e-01   3.155 0.001613 ** 
## PCST_E911           NA         NA      NA       NA    
## CW_CSTE      8.870e+00  7.804e+00   1.137 0.255732    
## VCST_M28     8.547e-01  2.769e-01   3.086 0.002035 ** 
## PCST_M28            NA         NA      NA       NA    
## VCST_M911    3.650e-01  3.518e-01   1.038 0.299519    
## PCST_M911           NA         NA      NA       NA    
## CW_CSTM     -5.585e+00  7.878e+00  -0.709 0.478347    
## VCST_S28     1.336e-02  2.900e-02   0.461 0.645146    
## PCST_S28            NA         NA      NA       NA    
## VCST_S911   -2.576e-03  9.824e-02  -0.026 0.979085    
## PCST_S911           NA         NA      NA       NA    
## CW_CSTS     -4.500e-01  7.616e+00  -0.059 0.952882    
## VCST_H28    -3.701e-02  1.998e-02  -1.852 0.064057 .  
## PCST_H28            NA         NA      NA       NA    
## VCST_H911   -1.312e-02  2.249e-02  -0.583 0.559677    
## PCST_H911           NA         NA      NA       NA    
## CW_CSTH     -3.998e-01  7.626e+00  -0.052 0.958187    
## VNRT_R28    -4.921e+00  6.898e+00  -0.713 0.475597    
## PNRT_R28            NA         NA      NA       NA    
## CW_NRTR      1.778e+00  1.795e+01   0.099 0.921081    
## VNRT_L28    -1.075e+11  1.158e+11  -0.928 0.353298    
## PNRT_L28     3.585e+12  3.862e+12   0.928 0.353298    
## CW_NRTL      1.201e+02  4.214e+01   2.850 0.004389 ** 
## VNRT_S28     6.399e-01  7.996e-01   0.800 0.423522    
## PNRT_S28            NA         NA      NA       NA    
## CW_NRTS     -1.550e+02  4.179e+01  -3.710 0.000209 ***
## VNRT_M28    -2.361e-01  7.557e-01  -0.312 0.754724    
## PNRT_M28            NA         NA      NA       NA    
## CW_NRTM      9.091e+00  1.605e+01   0.566 0.571121    
## VCHS_E911   -6.596e-02  9.927e-02  -0.664 0.506404    
## PCHS_E911           NA         NA      NA       NA    
## CW_CHSE      2.223e+00  7.674e+00   0.290 0.772121    
## VCHS_M911    5.043e-02  9.471e-02   0.532 0.594406    
## PCHS_M911           NA         NA      NA       NA    
## CW_CHSM      4.562e+00  7.642e+00   0.597 0.550600    
## TOT_28              NA         NA      NA       NA    
## TOT_911             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.11 on 6582 degrees of freedom
## Multiple R-squared:  0.8452, Adjusted R-squared:  0.8432 
## F-statistic: 417.9 on 86 and 6582 DF,  p-value: < 2.2e-16
# Standard diagnostic plots for the full model (residuals vs fitted, Q-Q,
# scale-location, residuals vs leverage).
plot(lm)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# NOTE(review): the NaN warnings come from plot.lm's leverage-based Cook's
# distance contour computation — presumably extreme hat values; the other
# panels are unaffected.

# Many coefficients came back NA (not estimable), so reduce the predictor set.
### Model selection: backward stepwise selection via regsubsets.
# nvmax = 10: a model with more than 10 predictors would be too complex to
# interpret or to build on.

reg1 <- regsubsets(
  API05B ~ .,
  data = training.newbase,
  nvmax = 10,
  method = "backward",
  really.big = TRUE
)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 16 linear dependencies found
## Reordering variables and trying again:

reg.summary <- summary(reg1)

reg.summary
## Subset selection object
## Call: regsubsets.formula(API05B ~ ., data = training.newbase, really.big = TRUE, 
##     nvmax = 10, method = "backward")
## 101 Variables  (and intercept)
##            Forced in Forced out
## STYPE2         FALSE      FALSE
## STYPE3         FALSE      FALSE
## STYPE4         FALSE      FALSE
## STYPEE         FALSE      FALSE
## STYPEH         FALSE      FALSE
## STYPEM         FALSE      FALSE
## VALID_NUM      FALSE      FALSE
## AA_NUM         FALSE      FALSE
## AA_SIGYes      FALSE      FALSE
## AI_NUM         FALSE      FALSE
## AI_SIGYes      FALSE      FALSE
## AS_NUM         FALSE      FALSE
## AS_SIGYes      FALSE      FALSE
## FI_NUM         FALSE      FALSE
## FI_SIGYes      FALSE      FALSE
## HI_NUM         FALSE      FALSE
## HI_SIGYes      FALSE      FALSE
## PI_NUM         FALSE      FALSE
## PI_SIGYes      FALSE      FALSE
## WH_NUM         FALSE      FALSE
## WH_SIGYes      FALSE      FALSE
## SD_NUM         FALSE      FALSE
## SD_SIGYes      FALSE      FALSE
## EL_NUM         FALSE      FALSE
## EL_SIGYes      FALSE      FALSE
## DI_NUM         FALSE      FALSE
## DI_SIGYes      FALSE      FALSE
## PCT_AA         FALSE      FALSE
## PCT_AI         FALSE      FALSE
## PCT_AS         FALSE      FALSE
## PCT_FI         FALSE      FALSE
## PCT_HI         FALSE      FALSE
## PCT_PI         FALSE      FALSE
## PCT_WH         FALSE      FALSE
## MEALS          FALSE      FALSE
## P_FDAY         FALSE      FALSE
## P_GATE         FALSE      FALSE
## P_MIGED        FALSE      FALSE
## P_EL           FALSE      FALSE
## P_RFEP         FALSE      FALSE
## P_DI           FALSE      FALSE
## SMOB           FALSE      FALSE
## CBMOB          FALSE      FALSE
## DMOB           FALSE      FALSE
## PCT_RESP       FALSE      FALSE
## NOT_HSG        FALSE      FALSE
## HSG            FALSE      FALSE
## SOME_COL       FALSE      FALSE
## COL_GRAD       FALSE      FALSE
## GRAD_SCH       FALSE      FALSE
## AVG_ED         FALSE      FALSE
## FULL           FALSE      FALSE
## EMER           FALSE      FALSE
## PEN_2          FALSE      FALSE
## PEN_35         FALSE      FALSE
## PEN_6          FALSE      FALSE
## PEN_78         FALSE      FALSE
## PEN_911        FALSE      FALSE
## ENROLL         FALSE      FALSE
## PARENT_OPT     FALSE      FALSE
## TESTED         FALSE      FALSE
## VCST_E28       FALSE      FALSE
## VCST_E911      FALSE      FALSE
## CW_CSTE        FALSE      FALSE
## VCST_M28       FALSE      FALSE
## VCST_M911      FALSE      FALSE
## CW_CSTM        FALSE      FALSE
## VCST_S28       FALSE      FALSE
## VCST_S911      FALSE      FALSE
## CW_CSTS        FALSE      FALSE
## VCST_H28       FALSE      FALSE
## VCST_H911      FALSE      FALSE
## CW_CSTH        FALSE      FALSE
## VNRT_R28       FALSE      FALSE
## CW_NRTR        FALSE      FALSE
## VNRT_L28       FALSE      FALSE
## CW_NRTL        FALSE      FALSE
## VNRT_S28       FALSE      FALSE
## CW_NRTS        FALSE      FALSE
## VNRT_M28       FALSE      FALSE
## CW_NRTM        FALSE      FALSE
## VCHS_E911      FALSE      FALSE
## CW_CHSE        FALSE      FALSE
## VCHS_M911      FALSE      FALSE
## CW_CHSM        FALSE      FALSE
## PCST_E28       FALSE      FALSE
## PCST_E911      FALSE      FALSE
## PCST_M28       FALSE      FALSE
## PCST_M911      FALSE      FALSE
## PCST_S28       FALSE      FALSE
## PCST_S911      FALSE      FALSE
## PCST_H28       FALSE      FALSE
## PCST_H911      FALSE      FALSE
## PNRT_R28       FALSE      FALSE
## PNRT_L28       FALSE      FALSE
## PNRT_S28       FALSE      FALSE
## PNRT_M28       FALSE      FALSE
## PCHS_E911      FALSE      FALSE
## PCHS_M911      FALSE      FALSE
## TOT_28         FALSE      FALSE
## TOT_911        FALSE      FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: backward
##           STYPE2 STYPE3 STYPE4 STYPEE STYPEH STYPEM VALID_NUM AA_NUM
## 1  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 2  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 3  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 4  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 5  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 6  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 7  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 8  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 9  ( 1 )  " "    " "    " "    " "    " "    " "    " "       " "   
## 10  ( 1 ) " "    " "    " "    " "    " "    " "    " "       " "   
## 11  ( 1 ) " "    " "    " "    " "    " "    " "    " "       " "   
##           AA_SIGYes AI_NUM AI_SIGYes AS_NUM AS_SIGYes FI_NUM FI_SIGYes
## 1  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 2  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 3  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 4  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 5  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 6  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 7  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 8  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 9  ( 1 )  " "       " "    " "       " "    " "       " "    " "      
## 10  ( 1 ) " "       " "    " "       " "    " "       " "    " "      
## 11  ( 1 ) " "       " "    " "       " "    " "       " "    " "      
##           HI_NUM HI_SIGYes PI_NUM PI_SIGYes WH_NUM WH_SIGYes SD_NUM
## 1  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 2  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 3  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 4  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 5  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 6  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 7  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 8  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 9  ( 1 )  " "    " "       " "    " "       " "    " "       " "   
## 10  ( 1 ) " "    " "       " "    " "       " "    " "       " "   
## 11  ( 1 ) " "    " "       " "    " "       " "    " "       " "   
##           SD_SIGYes EL_NUM EL_SIGYes DI_NUM DI_SIGYes PCT_AA PCT_AI PCT_AS
## 1  ( 1 )  " "       " "    " "       " "    " "       " "    " "    " "   
## 2  ( 1 )  " "       " "    " "       " "    " "       " "    " "    " "   
## 3  ( 1 )  " "       " "    " "       " "    " "       " "    " "    " "   
## 4  ( 1 )  " "       " "    " "       " "    " "       " "    " "    " "   
## 5  ( 1 )  " "       " "    " "       " "    " "       " "    " "    "*"   
## 6  ( 1 )  " "       " "    " "       " "    " "       " "    " "    "*"   
## 7  ( 1 )  " "       " "    " "       " "    " "       " "    " "    "*"   
## 8  ( 1 )  " "       " "    " "       " "    " "       " "    " "    "*"   
## 9  ( 1 )  " "       " "    " "       " "    " "       " "    " "    "*"   
## 10  ( 1 ) " "       " "    " "       " "    " "       " "    " "    "*"   
## 11  ( 1 ) " "       " "    " "       " "    " "       " "    " "    "*"   
##           PCT_FI PCT_HI PCT_PI PCT_WH MEALS P_FDAY P_GATE P_MIGED P_EL
## 1  ( 1 )  " "    " "    " "    " "    " "   " "    " "    " "     " " 
## 2  ( 1 )  " "    " "    " "    " "    "*"   " "    " "    " "     " " 
## 3  ( 1 )  " "    " "    " "    " "    "*"   " "    " "    " "     " " 
## 4  ( 1 )  " "    " "    " "    " "    "*"   " "    " "    " "     " " 
## 5  ( 1 )  " "    " "    " "    " "    "*"   " "    " "    " "     " " 
## 6  ( 1 )  " "    " "    " "    "*"    "*"   " "    " "    " "     " " 
## 7  ( 1 )  " "    " "    " "    "*"    "*"   " "    " "    " "     " " 
## 8  ( 1 )  " "    " "    " "    "*"    "*"   " "    " "    " "     " " 
## 9  ( 1 )  " "    " "    " "    "*"    "*"   " "    "*"    " "     " " 
## 10  ( 1 ) " "    " "    " "    "*"    "*"   " "    "*"    " "     " " 
## 11  ( 1 ) " "    " "    " "    "*"    "*"   " "    "*"    " "     " " 
##           P_RFEP P_DI SMOB CBMOB DMOB PCT_RESP NOT_HSG HSG SOME_COL
## 1  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 2  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 3  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 4  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 5  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 6  ( 1 )  " "    " "  "*"  " "   " "  " "      " "     " " " "     
## 7  ( 1 )  " "    "*"  "*"  " "   " "  " "      " "     " " " "     
## 8  ( 1 )  " "    "*"  "*"  " "   " "  " "      " "     " " " "     
## 9  ( 1 )  " "    "*"  "*"  " "   " "  " "      " "     " " " "     
## 10  ( 1 ) " "    "*"  "*"  " "   " "  " "      " "     " " " "     
## 11  ( 1 ) " "    "*"  "*"  " "   " "  " "      "*"     " " " "     
##           COL_GRAD GRAD_SCH AVG_ED FULL EMER PEN_2 PEN_35 PEN_6 PEN_78
## 1  ( 1 )  " "      " "      " "    " "  " "  " "   " "    " "   " "   
## 2  ( 1 )  " "      " "      " "    " "  " "  " "   " "    " "   " "   
## 3  ( 1 )  " "      " "      " "    " "  " "  " "   " "    " "   " "   
## 4  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 5  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 6  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 7  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 8  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 9  ( 1 )  " "      "*"      " "    " "  " "  " "   " "    " "   " "   
## 10  ( 1 ) " "      "*"      " "    " "  " "  " "   " "    "*"   " "   
## 11  ( 1 ) " "      "*"      " "    " "  " "  " "   " "    "*"   " "   
##           PEN_911 ENROLL PARENT_OPT TESTED VCST_E28 PCST_E28 VCST_E911
## 1  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 2  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 3  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 4  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 5  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 6  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 7  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 8  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 9  ( 1 )  " "     " "    " "        " "    " "      " "      " "      
## 10  ( 1 ) " "     " "    " "        " "    " "      " "      " "      
## 11  ( 1 ) " "     " "    " "        " "    " "      " "      " "      
##           PCST_E911 CW_CSTE VCST_M28 PCST_M28 VCST_M911 PCST_M911 CW_CSTM
## 1  ( 1 )  " "       " "     " "      " "      " "       " "       " "    
## 2  ( 1 )  " "       " "     " "      " "      " "       " "       " "    
## 3  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 4  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 5  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 6  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 7  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 8  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 9  ( 1 )  " "       "*"     " "      " "      " "       " "       " "    
## 10  ( 1 ) " "       "*"     " "      " "      " "       " "       " "    
## 11  ( 1 ) " "       "*"     " "      " "      " "       " "       " "    
##           VCST_S28 PCST_S28 VCST_S911 PCST_S911 CW_CSTS VCST_H28 PCST_H28
## 1  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 2  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 3  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 4  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 5  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 6  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 7  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 8  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 9  ( 1 )  " "      " "      " "       " "       " "     " "      " "     
## 10  ( 1 ) " "      " "      " "       " "       " "     " "      " "     
## 11  ( 1 ) " "      " "      " "       " "       " "     " "      " "     
##           VCST_H911 PCST_H911 CW_CSTH VNRT_R28 PNRT_R28 CW_NRTR VNRT_L28
## 1  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 2  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 3  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 4  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 5  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 6  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 7  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 8  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 9  ( 1 )  " "       " "       " "     " "      " "      " "     " "     
## 10  ( 1 ) " "       " "       " "     " "      " "      " "     " "     
## 11  ( 1 ) " "       " "       " "     " "      " "      " "     " "     
##           PNRT_L28 CW_NRTL VNRT_S28 PNRT_S28 CW_NRTS VNRT_M28 PNRT_M28
## 1  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 2  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 3  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 4  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 5  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 6  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 7  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 8  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 9  ( 1 )  " "      " "     " "      " "      " "     " "      " "     
## 10  ( 1 ) " "      " "     " "      " "      " "     " "      " "     
## 11  ( 1 ) " "      " "     " "      " "      " "     " "      " "     
##           CW_NRTM VCHS_E911 PCHS_E911 CW_CHSE VCHS_M911 PCHS_M911 CW_CHSM
## 1  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 2  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 3  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 4  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 5  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 6  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 7  ( 1 )  " "     " "       " "       " "     " "       " "       " "    
## 8  ( 1 )  " "     " "       " "       " "     " "       " "       "*"    
## 9  ( 1 )  " "     " "       " "       " "     " "       " "       "*"    
## 10  ( 1 ) " "     " "       " "       " "     " "       " "       "*"    
## 11  ( 1 ) " "     " "       " "       " "     " "       " "       "*"    
##           TOT_28 TOT_911
## 1  ( 1 )  " "    " "    
## 2  ( 1 )  " "    " "    
## 3  ( 1 )  " "    " "    
## 4  ( 1 )  " "    " "    
## 5  ( 1 )  " "    " "    
## 6  ( 1 )  " "    " "    
## 7  ( 1 )  " "    " "    
## 8  ( 1 )  " "    " "    
## 9  ( 1 )  " "    " "    
## 10  ( 1 ) " "    " "    
## 11  ( 1 ) " "    " "
# Examine the changes of R^2, Adjusted R^2 (how well the model explains),
# Residual Sum of Squares, Cp (size of the bias), and BIC as the number of
# predictors grows.

# Helper: line-plot a selection criterion against model size, print its best
# value and the size at which it occurs, and mark sizes 3 and 4 in red
# (the visual elbow points). Set minimize = TRUE for criteria where smaller
# is better (RSS, Cp, BIC).
# (This plot/best/points chunk was previously copy-pasted five times.)
plot_criterion <- function(values, ylab, minimize = FALSE) {
  plot(values, xlab = "number of variables", ylab = ylab, type = "l")
  if (minimize) {
    print(min(values))
    print(which.min(values))
  } else {
    print(max(values))
    print(which.max(values))
  }
  points(3, values[3], col = "red", cex = 1, pch = 20)
  points(4, values[4], col = "red", cex = 1, pch = 20)
}

# R Squared
plot_criterion(reg.summary$rsq, "Rsquared")

# As the graph shows, there is a significant change of R^2 at 3 and 4
# predictors; after that the curve only increases slightly, so I take those
# to be the critical points.

# Residual Sum of Squares
plot_criterion(reg.summary$rss, "RSS", minimize = TRUE)

# Adjusted R Squared
plot_criterion(reg.summary$adjr2, "Adjusted R^2", minimize = FALSE)

# CP
# From Penn State Univ Stat online course:
# "An underspecified model is a model in which important predictors are missing.
# And, an underspecified model yields biased regression coefficients and biased
# predictions of the response. Well, in short, Mallows' CP statistic estimates
# the size of the bias that is introduced into the predicted responses by
# having an underspecified model."
# In short, Cp reflects the size of the bias.
plot_criterion(reg.summary$cp, "Cp", minimize = TRUE)

# BIC
plot_criterion(reg.summary$bic, "BIC", minimize = TRUE)

# We first try the model with 4 predictors.
# Select the size-4 model by position directly; the original recovered the
# index via float equality — which(reg.summary$bic == reg.summary$bic[4]) —
# which is fragile (== on doubles) and simply evaluates to 4.
a <- names(coef(reg1, 4))

reg.summary$rsq[4]

reg.summary$adjr2[4]

# Keep the response plus the four selected predictors (a[1] is "(Intercept)").
training.newbase1 <- select(training.newbase, c("API05B", a[2:5]))
rownames(training.newbase1) <- seq_len(nrow(training.newbase1))

# pairwise plots between the response and the selected predictors
plot(training.newbase1)

# GRAD_SCH against the others shows some curvature — it looks log-like;
# VCST_M911 seems far too skewed.

# correlations among the selected variables
# NOTE: this reuses the name `cor`, shadowing stats::cor() from here on.
cor <- cor(training.newbase1)
corrplot(cor,
         method = c("circle"),
         title = "Correlation between variables",
         type = c("full"))

# very little correlation between VCST_M911 and any of the others
training.newbase1 %>%
  keep(is.numeric) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30) +
  geom_density() +
  facet_wrap(~key, scales = "free")

#We might want to exclude VCST_M911, but first see how the full model does.
#Fit the initial 4-predictor linear model on the training data.
lm1<-lm(API05B~., data=training.newbase1)
summary(lm1)
## 
## Call:
## lm(formula = API05B ~ ., data = training.newbase1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -524.01  -34.74    4.39   39.43  371.32 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.968e+02  2.692e+00 296.042  < 2e-16 ***
## MEALS       -1.171e+00  3.419e-02 -34.265  < 2e-16 ***
## SMOB        -4.205e+00  5.567e-02 -75.543  < 2e-16 ***
## GRAD_SCH     2.652e+00  8.543e-02  31.046  < 2e-16 ***
## VCST_M911   -3.057e-03  4.015e-04  -7.615 3.01e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.99 on 6664 degrees of freedom
## Multiple R-squared:  0.6846, Adjusted R-squared:  0.6844 
## F-statistic:  3616 on 4 and 6664 DF,  p-value: < 2.2e-16
#Standard diagnostic plots: residuals vs fitted, Q-Q, scale-location,
#residuals vs leverage.
plot(lm1)

#adj R^2 68%; the diagnostics show some patterns and some outliers.
#a = predicted test values, b = observed test values; both reused in the
#model-progress plots further below.
a<-predict(lm1,newdata=test.newbase)
b<-test.newbase$API05B


plot(a,b)

#Predicted vs observed API05B on the test set: roughly linear with some
#outliers.
predictions1 <- lm1 %>% predict(test.newbase)
# Model performance: test-set MSE / RMSE / R^2 for the first model.
data.frame(
  MSE = mse(test.newbase$API05B, predictions1),
  RMSE = RMSE(predictions1, test.newbase$API05B),
  R2 = R2(predictions1, test.newbase$API05B)
)
##        MSE     RMSE        R2
## 1 4116.149 64.15722 0.6809287
#Acceptable MSE; 68% of the variance explained.
#########Outliers - Cooks Distance

#Finding outliers in our model.

#There are several ways to detect outliers in the model.
#Bonferroni-adjusted test on the studentized residuals (car package).
outlierTest(lm1,n.max=50,cutoff=0.05)
##       rstudent unadjusted p-value Bonferonni p
## 4152 10.797847         5.8846e-27   3.9244e-23
## 3762 -8.231389         2.2069e-16   1.4718e-12
## 1452 -5.671927         1.4712e-08   9.8116e-05
## 4680 -5.128051         3.0102e-07   2.0075e-03
## 4812 -5.113673         3.2477e-07   2.1659e-03
## 1356 -5.102589         3.4431e-07   2.2962e-03
## 743  -4.993868         6.0687e-07   4.0472e-03
## 2258  4.965982         7.0055e-07   4.6720e-03
## 2700 -4.911503         9.2536e-07   6.1712e-03
## 3364 -4.791230         1.6935e-06   1.1294e-02
## 3944 -4.708948         2.5404e-06   1.6942e-02
## 5605 -4.590720         4.4982e-06   2.9999e-02
## 1465 -4.580471         4.7237e-06   3.1502e-02
## 3924 -4.567243         5.0307e-06   3.3550e-02
## 488  -4.555222         5.3262e-06   3.5521e-02
qqPlot(lm1,main="QQ Plot")

## [1] 3762 4152
#Added-variable (partial regression) plots, one per predictor.
avPlots(lm1)

#Influence plot: studentized residuals vs hat values, circle area
#proportional to Cook's distance.
#NOTE(review): id.method= was removed from car::influencePlot (replaced by
#id=), which is what produces the warnings recorded below.
influencePlot(lm1,id.method="identify",main="influential plot",sub="circle size is proportial to cook's distance")
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is
## not a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##         StudRes          Hat        CookD
## 1797  0.9614079 0.0228222701  0.004317532
## 3762 -8.2313890 0.0004394439  0.005898503
## 4152 10.7978467 0.7062204348 55.100348069
## 5582 -3.5486831 0.0116954299  0.029753287
#However,
#I'm using Standardized Residuals to detect outliers

#from Springers Text book "Linear Regression"

#"In summary, an outlier is a point whose standardized residual falls outside the interval from -2 to 2. Recall that a bad leverage point is a leverage point which is also an outlier. Thus, a bad leverage point is a leverage point whose standardized residual falls outside the interval from -2 to 2."

#NOTE(review): the quote above uses |standardized residual| > 2, while the
#code below uses a looser cutoff of 4 (removing only the most extreme
#points) — confirm the threshold choice is intentional.

#Compute the standardized residuals once.  The original code recomputed
#rstandard() twice and spelled out infl=, sd= and type=, all of which were
#just rstandard.lm's defaults.
rs <- rstandard(lm1)
o1 <- which(rs > 4)     #rows with standardized residual above +4
o2 <- which(rs < -4)    #rows with standardized residual below -4


outliers <- c(o1,o2)
length(outliers)
## [1] 26
#####The data set after removing the 26 flagged outliers
training.newbase2<-training.newbase1[-outliers,]

#Investigate a simple regression of the response on each predictor in turn.
#API05B ~ MEALS (single-predictor model on the cleaned training data).
lm.test1<-lm(API05B~MEALS, data=training.newbase2)
summary(lm.test1)
## 
## Call:
## lm(formula = API05B ~ MEALS, data = training.newbase2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -459.73  -32.82   18.13   58.80  260.49 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 821.0938     2.2436  365.98   <2e-16 ***
## MEALS        -2.0261     0.0384  -52.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 94.57 on 6641 degrees of freedom
## Multiple R-squared:  0.2954, Adjusted R-squared:  0.2953 
## F-statistic:  2784 on 1 and 6641 DF,  p-value: < 2.2e-16
plot(lm.test1)

#adj R^2 29.5%; diagnostic plots look ok except for the normal Q-Q plot.
#API05B ~ SMOB (single-predictor model).
lm.test2<-lm(API05B~SMOB, data=training.newbase2)
summary(lm.test2)
## 
## Call:
## lm(formula = API05B ~ SMOB, data = training.newbase2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -323.52  -60.06   -2.04   60.88  349.29 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 773.80476    1.33624  579.09   <2e-16 ***
## SMOB         -4.92018    0.07328  -67.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 86.95 on 6641 degrees of freedom
## Multiple R-squared:  0.4044, Adjusted R-squared:  0.4043 
## F-statistic:  4508 on 1 and 6641 DF,  p-value: < 2.2e-16
plot(lm.test2)

#adj R^2 40%; the residual plots suggest heteroscedasticity.
#API05B ~ GRAD_SCH (single-predictor model).
lm.test3<-lm(API05B~GRAD_SCH, data=training.newbase2)
summary(lm.test3)
## 
## Call:
## lm(formula = API05B ~ GRAD_SCH, data = training.newbase2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -422.22  -38.53   11.09   56.84  288.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 660.47007    1.40685  469.47   <2e-16 ***
## GRAD_SCH      5.68746    0.08678   65.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 87.79 on 6641 degrees of freedom
## Multiple R-squared:  0.3928, Adjusted R-squared:  0.3927 
## F-statistic:  4296 on 1 and 6641 DF,  p-value: < 2.2e-16
plot(lm.test3)

#R^2 39%; the residual plots show extreme heteroscedasticity.
#API05B ~ VCST_M911 (single-predictor model).
lm.test4<-lm(API05B~VCST_M911, data=training.newbase2)
summary(lm.test4)
## 
## Call:
## lm(formula = API05B ~ VCST_M911, data = training.newbase2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -448.63  -63.67    4.33   76.33  277.33 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 720.673180   1.420456 507.353  < 2e-16 ***
## VCST_M911    -0.003498   0.001295  -2.701  0.00692 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112.6 on 6641 degrees of freedom
## Multiple R-squared:  0.001098,   Adjusted R-squared:  0.0009472 
## F-statistic: 7.297 on 1 and 6641 DF,  p-value: 0.006924
plot(lm.test4)

#R^2 0.1% — VCST_M911 alone explains almost nothing, and the residual
#plots show extreme patterns indicating heteroscedasticity.
#####Model after removing outliers
#Refit the full 4-predictor model on the cleaned training data.
lm2<-lm(API05B~., data=training.newbase2)
summary(lm2)
## 
## Call:
## lm(formula = API05B ~ ., data = training.newbase2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -252.496  -34.327    3.441   37.821  248.200 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.044e+02  2.590e+00  310.63   <2e-16 ***
## MEALS       -1.239e+00  3.266e-02  -37.94   <2e-16 ***
## SMOB        -4.259e+00  5.313e-02  -80.16   <2e-16 ***
## GRAD_SCH     2.555e+00  8.143e-02   31.38   <2e-16 ***
## VCST_M911   -9.973e-03  7.023e-04  -14.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.66 on 6638 degrees of freedom
## Multiple R-squared:  0.7102, Adjusted R-squared:   0.71 
## F-statistic:  4067 on 4 and 6638 DF,  p-value: < 2.2e-16
plot(lm2)

#adj R^2 71%; plots look ok apart from some remaining heteroscedasticity.
#I decide to drop VCST_M911 (near-zero marginal R^2 above).

#R^2 got higher, and the model diagnostics ('fitted values vs residuals',
#'Normal Q-Q', 'sqrt of standardized residuals vs fitted values') improved
#noticeably after removing the outliers.

#Drop VCST_M911 (negligible marginal R^2 and strong heteroscedasticity).
training.newbase3<-select(training.newbase2, -c("VCST_M911"))

#3-predictor model: MEALS + SMOB + GRAD_SCH.
lm3<-lm(API05B~., data=training.newbase3)
summary(lm3)
## 
## Call:
## lm(formula = API05B ~ ., data = training.newbase3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -250.273  -36.079    3.853   39.130  246.257 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 798.20643    2.59092  308.08   <2e-16 ***
## MEALS        -1.19113    0.03297  -36.13   <2e-16 ***
## SMOB         -4.21369    0.05383  -78.27   <2e-16 ***
## GRAD_SCH      2.62762    0.08249   31.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.57 on 6639 degrees of freedom
## Multiple R-squared:  0.7014, Adjusted R-squared:  0.7013 
## F-statistic:  5198 on 3 and 6639 DF,  p-value: < 2.2e-16
plot(lm3)

#Still showing some patterns; diagnostics for normality of residuals,
#multicollinearity and heteroscedasticity follow below.
#a1 holds this model's test-set predictions; reused in the
#"Removed outliers" comparison plot further below.
a1<-predict(lm3,newdata = test.newbase)

predictions2 <- lm3 %>% predict(test.newbase)
# Model performance: test-set MSE / RMSE / R^2 for the 3-predictor model.
data.frame(
  MSE = mse(test.newbase$API05B, predictions2),
  RMSE = RMSE(predictions2, test.newbase$API05B),
  R2 = R2(predictions2, test.newbase$API05B)
)
##        MSE     RMSE        R2
## 1 4185.408 64.69473 0.6756477
#########Normality (of residuals)

#We don't have to worry much about the normality-of-residuals assumption:
#with a sufficiently large sample, the Central Limit Theorem makes the
#sampling distribution of the coefficient estimates approximately normal.

#Histogram of the residuals of the 3-predictor model.
#NOTE(review): qplot() is deprecated in ggplot2 >= 3.4; consider
#ggplot(data.frame(r = residuals(lm3)), aes(r)) + geom_histogram()
#when upgrading.
qplot(residuals(lm3),
      geom="histogram",
      binwidth= 10,
      main="Histogram of Residuals of our model",
      xlab="Residuals of our model",
      ylab="Frequency")

#The residuals of our model are approximately normally distributed.
##########Multicollinearity

# From STHDA website.. 

#definition

##In multiple regression, two or more predictor variables might be correlated with each other. This situation is referred to as collinearity.

#There is an extreme situation, called multicollinearity, where collinearity exists between three or more variables even if no pair of variables has a particularly high correlation. This means that there is redundancy between predictor variables.

#Variance inflation factors for the 3-predictor model (car::vif).
vif(lm3)
##    MEALS     SMOB GRAD_SCH 
## 1.738969 1.076189 1.837037
#Predictors with a VIF above 4 would be removed, but none exceed that here.
##########Heteroscedasticity (non-constant residual variance)

plot(lm3)

#The first diagnostic plot (fitted values vs residuals) still suggests
#heteroscedasticity, so apply a variance-stabilizing transformation.
#box-cox

#Box-Cox search for the response power lambda that maximizes the
#profile log-likelihood (MASS::boxcox).
bc<-boxcox(lm3)

lambda<-bc$x[which.max(bc$y)]
lambda
## [1] 2
#NOTE(review): lambda = 2 sits at the upper edge of boxcox()'s default
#search grid (-2..2); consider widening the lambda range to confirm the
#maximum is interior rather than truncated.
#Box-Cox power transform of the response plus log transform of predictors.
#The +1 inside each log() guards against log(0) for zero-valued predictors.
lm4<-lm((API05B)^(lambda)~log(MEALS+1)+log(SMOB+1)+log(GRAD_SCH+1), data=training.newbase3)
summary(lm4)
## 
## Call:
## lm(formula = (API05B)^(lambda) ~ log(MEALS + 1) + log(SMOB + 
##     1) + log(GRAD_SCH + 1), data = training.newbase3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -510833  -59447    3296   62040  372286 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         754738       8669   87.06   <2e-16 ***
## log(MEALS + 1)      -36637       1494  -24.52   <2e-16 ***
## log(SMOB + 1)       -90148       1529  -58.95   <2e-16 ***
## log(GRAD_SCH + 1)    49966       1540   32.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 92650 on 6639 degrees of freedom
## Multiple R-squared:  0.6506, Adjusted R-squared:  0.6505 
## F-statistic:  4122 on 3 and 6639 DF,  p-value: < 2.2e-16
plot(lm4)

#The diagnostics look improved.
#Back-transform the predictions to the original API05B scale: lm4 predicts
#(API05B)^2 (lambda = 2), so sqrt() undoes the power transform.
a2<-sqrt(predict(lm4,newdata = test.newbase))

#Model progress shown graphically: predicted vs observed test values for
#each stage (a/a1/a2 are the three models' predictions, b the observed
#API05B test values, all defined above).  The red y = x line marks perfect
#prediction.

plot(a,b,
     main="First model prediction",
     xlab="predicted values",
     ylab="values in test dataset")
abline(a=0,b=1,col="red")

plot(a1,b,
     main="Removed outliers",
     xlab="predicted values",
     ylab="values in test dataset")
abline(a=0,b=1,col="red")

plot(a2,b,
     main="After transformation - Final model",
     xlab="predicted values",
     ylab="values in test dataset")
abline(a=0,b=1,col="red")

#lm4 predicts on the transformed scale ((API05B)^lambda with lambda = 2),
#so its predictions must be back-transformed with sqrt() before being
#compared with the raw API05B test values.  The original code skipped the
#back-transform, which is why the recorded output below shows an inflated,
#squared-scale MSE (~3e11) and RMSE (~545344); re-knit to refresh.
predictions3 <- sqrt(predict(lm4, test.newbase))

# Model performance (on the original API05B scale)
data.frame(
  MSE = mse(test.newbase$API05B, predictions3),
  RMSE = RMSE(predictions3, test.newbase$API05B),
  R2 = R2(predictions3, test.newbase$API05B)
)
##            MSE     RMSE        R2
## 1 297400280996 545344.2 0.6135713
#Model performance across the three modeling stages, collected in one table.
#NOTE(review): in the recorded run predictions3 was still on the
#transformed (API05B^2) scale, so the final-model MSE/RMSE row below is
#not comparable with the first two rows — back-transform with sqrt()
#before comparing.

c.m<-data.frame(MSE=
                  c(mse(test.newbase$API05B, predictions1),
                    mse(test.newbase$API05B, predictions2),
                    mse(test.newbase$API05B, predictions3)
                  ),
                RMSE=
                  c(RMSE(predictions1,test.newbase$API05B),
                    RMSE(predictions2,test.newbase$API05B),
                    RMSE(predictions3,test.newbase$API05B)
                  ),
                R2=
                  c(R2(predictions1,test.newbase$API05B),
                    R2(predictions2,test.newbase$API05B),
                    R2(predictions3,test.newbase$API05B)
                  ))


rownames(c.m)<-c("First model","Removed Outliers","After transformation-final model")

c.m
##                                           MSE         RMSE        R2
## First model                      4.116149e+03     64.15722 0.6809287
## Removed Outliers                 4.185408e+03     64.69473 0.6756477
## After transformation-final model 2.974003e+11 545344.18581 0.6135713
#Final model summary, repeated here for the conclusion that follows.
summary(lm4)
## 
## Call:
## lm(formula = (API05B)^(lambda) ~ log(MEALS + 1) + log(SMOB + 
##     1) + log(GRAD_SCH + 1), data = training.newbase3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -510833  -59447    3296   62040  372286 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         754738       8669   87.06   <2e-16 ***
## log(MEALS + 1)      -36637       1494  -24.52   <2e-16 ***
## log(SMOB + 1)       -90148       1529  -58.95   <2e-16 ***
## log(GRAD_SCH + 1)    49966       1540   32.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 92650 on 6639 degrees of freedom
## Multiple R-squared:  0.6506, Adjusted R-squared:  0.6505 
## F-statistic:  4122 on 3 and 6639 DF,  p-value: < 2.2e-16
#Conclusion

#I conclude that we are able to predict future API scores with the three
#predictors described below, explaining about 65% of the variance.

#Interpretation of the Betas (the coefficients):
#Because each predictor enters the model as log(x + 1), the effect of a
#percentage change depends on the starting value x.  For large x,
#log(1.1x + 1) - log(x + 1) is approximately log(1.1) = 0.0953, so a 10%
#increase in GRAD_SCH raises the transformed response (API05B)^2 by
#roughly Beta3 * log(1.1) = 49966 * 0.0953 = 4762.

#Note that the response is modeled on the squared scale (lambda = 2 from
#the box-cox transformation), so a change of D in (API05B)^2 corresponds
#to a change of sqrt(y^2 + D) - y in API05B itself, which depends on the
#current score y — it is NOT simply sqrt(D).

#Similarly, a 10% increase in SMOB lowers (API05B)^2 by roughly
#abs(Beta2) * log(1.1) = 90148 * 0.0953 = 8591, with the corresponding
#decrease in the API05B score again depending on the current score.